# Read in main data set

data <- read.csv("raw/schma19962016.csv") %>% 
  clean_names()

addon <- read_csv("raw/stchso20052016.csv") %>% 
  clean_names()
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   DBN = col_character(),
##   SCHNAM = col_character(),
##   YEAR = col_double(),
##   XDBN = col_character(),
##   XDBN_FLAG = col_double(),
##   BOROUGH = col_character(),
##   DISTRICT = col_character(),
##   BN = col_character(),
##   BNLONG_NUMDIST = col_double(),
##   BNLONG_BNDUP = col_double(),
##   BNLONG_DISTDUP = col_double(),
##   BNLONG = col_character(),
##   BNLONG_FIXED = col_double(),
##   BNLONG_YRFIRST = col_double(),
##   BNLONG_YRLAST = col_double(),
##   BNLONG_TOTALYRS = col_double(),
##   BNLONG_PHASEDOUT19962016 = col_double(),
##   BNLONG_PHASEDIN19962016 = col_double(),
##   BNLONG_CONTOPERATE19962016 = col_double(),
##   HSOSCHNAM = col_character()
##   # ... with 31 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1559679 parsing failures.
##  row             col           expected actual                     file
## 1447 STCELANUMTSTG03 1/0/T/F/TRUE/FALSE    37  'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG04 1/0/T/F/TRUE/FALSE    46  'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG05 1/0/T/F/TRUE/FALSE    28  'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG06 1/0/T/F/TRUE/FALSE    36  'raw/stchso20052016.csv'
## 1447 STCELANUMTSTTOT 1/0/T/F/TRUE/FALSE    147 'raw/stchso20052016.csv'
## .... ............... .................. ...... ........................
## See problems(...) for more details.
#Load in collected data about new and failing schools
old_new_schools <- read_excel("raw/oldnewssc.xlsx")

# Create a list of schools so that I can see how they are named in this data

schools <- data %>% 
  select(schnam) %>% 
  distinct(schnam) %>% 
  arrange(schnam)

# Create a list of NYC's specialized schools

selective_schools <- c("BRONX HIGH SCHOOL OF SCIENCE", "BROOKLYN TECHNICAL HIGH SCHOOL", "STATEN ISLAND TECHNICAL HS", "STUYVESANT HIGH SCHOOL", "BROOKLYN LATIN, THE", "QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEG", "HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINEERI", "HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE")

# Create a dataset that filters out observations for selective schools

data_selective_schools <- data %>% 
  filter(schnam %in% selective_schools)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   DBN = col_character(),
##   Name = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 6 parsing failures.
##  row        col expected actual                                                                                 file
## 1124 fl_percent a double    n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 2154 fl_percent a double    n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 2161 fl_percent a double    n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 4299 fl_percent a double    n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 8523 fl_percent a double    n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## .... .......... ........ ...... ....................................................................................
## See problems(...) for more details.
## # A tibble: 2,322 x 39
##    dbn   name  schoolyear fl_percent frl_percent total_enrollment  prek     k
##    <chr> <chr>      <dbl>      <dbl>       <dbl>            <dbl> <dbl> <dbl>
##  1 01M2… HENR…       2008       99.5        NA                515    NA    NA
##  2 01M2… HENR…       2009       95.3        NA                470    NA    NA
##  3 01M2… HENR…       2010       NA          69.9              511    NA    NA
##  4 01M2… HENR…       2011       NA          73.3              448    NA    NA
##  5 01M2… HENR…       2012       NA          88.6              422    NA    NA
##  6 01M4… UNIV…       2006       71.5        NA                478    NA    NA
##  7 01M4… UNIV…       2007       71.5        NA                533    NA    NA
##  8 01M4… UNIV…       2008       71.5        NA                588    NA    NA
##  9 01M4… UNIV…       2009       60.9        NA                586    NA    NA
## 10 01M4… UNIV…       2010       NA          73.4              532    NA    NA
## # … with 2,312 more rows, and 31 more variables: grade1 <dbl>, grade2 <dbl>,
## #   grade3 <dbl>, grade4 <dbl>, grade5 <dbl>, grade6 <dbl>, grade7 <dbl>,
## #   grade8 <dbl>, grade9 <dbl>, grade10 <dbl>, grade11 <dbl>, grade12 <dbl>,
## #   ell_num <dbl>, ell_percent <dbl>, sped_num <dbl>, sped_percent <dbl>,
## #   ctt_num <dbl>, selfcontained_num <dbl>, asian_num <dbl>, asian_per <dbl>,
## #   black_num <dbl>, black_per <dbl>, hispanic_num <dbl>, hispanic_per <dbl>,
## #   white_num <dbl>, white_per <dbl>, male_num <dbl>, male_per <dbl>,
## #   female_num <dbl>, female_per <dbl>, total_hs_enrollment <dbl>
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   DBN = col_character(),
##   `School Name` = col_character(),
##   Year = col_character(),
##   `% Female` = col_character(),
##   `% Male` = col_character(),
##   `% Asian` = col_character(),
##   `% Black` = col_character(),
##   `% Hispanic` = col_character(),
##   `% Other` = col_character(),
##   `% White` = col_character(),
##   `% Students with Disabilities` = col_character(),
##   `% English Language Learners` = col_character(),
##   `% Poverty` = col_character()
## )
## See spec(...) for full column specifications.
# Filter for high schools (defined as schools serving 9-12 at least)  

data_hs <- data %>% 
  filter(enrnumg12 != 0 & enrnumg11 != 0 & enrnumg10  != 0 & enrnumg09 != 0 ) %>% 
  group_by(year) %>% 
  mutate(percent_rank = percent_rank(hsopcthsgtot))

# Plot all schools by year and graduation rates

data_hs %>% 
  filter(hsopcthsgtot != is.na(hsopcthsgtot)) %>% 
  select(schnam, year, hsopcthsgtot) %>% 
  ggplot(aes(x = year, y = hsopcthsgtot, color = schnam)) + 
  geom_jitter() +
  theme(legend.position = "none")

# Plot percentile rank of school over time

data_hs %>% 
  filter(schnam %in% selective_schools) %>% 
  ggplot(aes(x= year, y = percent_rank, color = schnam)) +
  geom_line() +
  theme(legend.position = "none")
## Warning: Removed 79 rows containing missing values (geom_path).

# Find out valid observations of graduation rates per year

data %>% 
  group_by(year) %>% 
  filter(hsopctdchtot != is.na(hsopcthsgtot)) %>% 
  select(schnam, year, hsopcthsgtot) %>% 
  count()
## # A tibble: 9 x 2
## # Groups:   year [9]
##    year     n
##   <int> <int>
## 1  2000     4
## 2  2001     6
## 3  2002    55
## 4  2003    57
## 5  2004   214
## 6  2005   213
## 7  2006   223
## 8  2007   276
## 9  2008   329

# Join the three data sets
# Because demo2006-2016 is missing 2005 data. I pulled those numbers from SCHMA
# demo2006-2016 has more reliable data, excpt for 2005

data_join <- data_hs %>% 
  full_join(grad_rates_clean, by = c("dbn", "year" = "grad_year")) %>% 
  full_join(demo_20062016, by = c("dbn", "year" = "schoolyear")) %>% 
  mutate(total_hs_enrollment_schnam = enrnumg09 + enrnumg10 + enrnumg11 + enrnumg12) %>% 
  mutate(total_hs_enrollment = coalesce(total_hs_enrollment, as.double(total_hs_enrollment_schnam)))
## Warning: Column `dbn` joining factor and character vector, coercing into
## character vector
# tool to test commands
#data_join %>% 
 # select(dbn, schnam, year, total_hs_enrollment, total_hs_enrollment_schnam) %>% 
 # arrange(dbn, year) %>% 
 # mutate(total_hs_enrollment = coalesce(total_hs_enrollment, as.double(total_hs_enrollment_schnam)))
## Adding missing grouping variables: `year`
## Warning: Removed 189 rows containing missing values (geom_path).

## Warning: Removed 689 rows containing missing values (geom_path).

## Adding missing grouping variables: `year`
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## Warning: Removed 23 rows containing non-finite values (stat_smooth).
## Warning: Removed 23 rows containing missing values (geom_point).

# Plot a line graph showing the relationship between year and graduation rates
# per type of school

replacement_schools_grad_vs_failing <- data_join %>% 
  filter(dbn %in% replacement_schools |
         dbn %in% large_failing_schools |
         dbn %in% failing_2005_large_open_shrank_list |
         dbn %in% failing_2005_large_closed_list |
         dbn %in% failing_2005_large_open_2015_list) %>% 
  
  # Did this to see if there was any pre-2005 data on large failing schools but seems like there isn't
  #mutate(total_grads_percent_of_cohort = coalesce(total_grads_percent_of_cohort, hsopcthsgnyc/100)) %>% 
  select(dbn, year, total_grads_percent_of_cohort) %>% 
  arrange(dbn, year) %>% 
  mutate(data_cat = ifelse(dbn %in% large_failing_schools,
                           "Announced for Closure in 2005",
                              ifelse(dbn %in% replacement_schools,
                               "Small Replacement Schools opened by 2005",
                                 ifelse(dbn %in% failing_2005_large_closed_list,
                                      "Large Eventually Closed",
                                      ifelse(dbn %in% failing_2005_large_open_shrank_list,
                                             "Large and Shrank",
                                             "Large and Operating"))))) %>% 
  
  # Picked 2008 as an arbitrary date, just to show what graduation rates were like 
  # for schools that were closed
  filter(!dbn %in% large_failing_schools | year <= 2008) %>% 
  
  # Started at '06 because 2005 had a small sample size of all the schools
  # Data was also kinda wonky but not sure if that's a good reason for excluding
  filter(!dbn %in% replacement_schools | year >= 2006)

# Plot it
replacement_schools_grad_vs_failing %>% 
  ggplot(aes(x = year, y = total_grads_percent_of_cohort, color = data_cat)) +
  geom_smooth(method = "lm",  se = FALSE)
## Warning: Removed 493 rows containing non-finite values (stat_smooth).

replacement_schools_grad_vs_failing %>% 
  group_by(year, data_cat) %>% 
  summarize(avg_grad = mean(total_grads_percent_of_cohort, na.rm =TRUE)) %>% 
  ggplot(aes(x = year, y = avg_grad, color = data_cat)) +
  geom_point() +
  geom_line() +
  xlim(2005, 2015) +
  ylim(0.25, 0.8)
## Warning: Removed 39 rows containing missing values (geom_point).
## Warning: Removed 39 rows containing missing values (geom_path).

data_join %>% 
  filter(dbn == "29Q272") %>% 
  select(dbn, school, total_grads_percent_of_cohort, hsonumcohtot, hsonumcohnyc, hsonumhsgtot)
## Adding missing grouping variables: `year`
## # A tibble: 12 x 7
## # Groups:   year [12]
##     year dbn    school   total_grads_per… hsonumcohtot hsonumcohnyc hsonumhsgtot
##    <dbl> <chr>  <chr>               <dbl>        <int>        <int>        <int>
##  1  2005 29Q272 GEORGE …            1               24           21           21
##  2  2006 29Q272 GEORGE …            0.737           43           39           28
##  3  2007 29Q272 GEORGE …            0.589          127           97           54
##  4  2008 29Q272 GEORGE …            0.651           97           65           42
##  5  2009 29Q272 GEORGE …            0.766           NA           79           60
##  6  2010 29Q272 GEORGE …            0.853           NA          103           88
##  7  2011 29Q272 GEORGE …            0.826           NA          111           91
##  8  2012 29Q272 GEORGE …            0.769           NA          110           84
##  9  2013 29Q272 GEORGE …            0.826           NA           95           76
## 10  2014 29Q272 GEORGE …            0.838           NA          112           94
## 11  2015 29Q272 GEORGE …            0.759           NA          137          101
## 12  2016 29Q272 <NA>               NA               NA           NA           NA

# Create dataset that describes 20,50,80 percentiles of regents/cohort per year
grad_rates_regents_cohort <- grad_rates_clean %>% 
  group_by(cohort_year) %>% 
  nest() %>% 
  mutate(percentile_20 = map_dbl(data,  ~ quantile(.[[9]], probs = .2)),
         percentile_80 = map_dbl(data,  ~ quantile(.[[9]], probs = .8)),
         percentile_50 = map_dbl(data,  ~ quantile(.[[9]], probs = .5))) %>% 
  arrange(cohort_year) %>% 
  select(-data)

#Graph the median result with errorbars at 80 and 20
grad_rates_regents_cohort %>% 
  ggplot(aes(x = cohort_year, y = percentile_50)) +
  geom_point() +
  geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

# Create dataset that describes 20,50,80 percentiles of regents/grads per year
grad_rates_regents_grads <- grad_rates_clean %>% 
  group_by(cohort_year) %>% 
  nest() %>% 
  mutate(percentile_20 = map_dbl(data,  ~ quantile(.[[10]], probs = .2, na.rm = TRUE)),
         percentile_80 = map_dbl(data,  ~ quantile(.[[10]], probs = .8, na.rm = TRUE)),
         percentile_50 = map_dbl(data,  ~ quantile(.[[10]], probs = .5, na.rm = TRUE))) %>% 
  arrange(cohort_year) %>% 
  select(-data)

#Graph the median result with errorbars at 80 and 20
grad_rates_regents_grads %>% 
  ggplot(aes(x = cohort_year, y = percentile_50)) +
  geom_point() +
  geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

# Create dataset that describes 20,50,80 percentiles of adv regents/cohort
grad_rates_advregents_cohort <- grad_rates_clean %>% 
  group_by(cohort_year) %>% 
  nest() %>% 
  mutate(percentile_20 = map_dbl(data,  ~ quantile(.[[13]], probs = .2, na.rm = TRUE)),
         percentile_80 = map_dbl(data,  ~ quantile(.[[13]], probs = .8, na.rm = TRUE)),
         percentile_50 = map_dbl(data,  ~ quantile(.[[13]], probs = .5, na.rm = TRUE))) %>% 
  arrange(cohort_year) %>% 
  select(-data)

#Graph the median result with errorbars at 80 and 20
grad_rates_advregents_cohort %>% 
  ggplot(aes(x = cohort_year, y = percentile_50)) +
  geom_point() +
  geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

## # A tibble: 11 x 2
## # Groups:   cohort_year [11]
##    cohort_year     n
##          <dbl> <int>
##  1        2001   214
##  2        2002   246
##  3        2003   276
##  4        2004   320
##  5        2005   353
##  6        2006   367
##  7        2007   383
##  8        2008   404
##  9        2009   419
## 10        2010   431
## 11        2011   441